Summary

This script takes the raw data downloaded from Crimson Hexagon and cleans it up for analysis. At the bottom of the script there are a few exploratory maps.

Setup

# Knit-time chunk defaults: suppress package messages and warnings in output.
# TRUE/FALSE spelled out — T/F are ordinary variables and can be reassigned.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

library(tidyverse)
library(jsonlite)
library(ggmap)
library(leaflet)
library(sf)
library(readxl)
library(reticulate)
library(RColorBrewer)
library(kableExtra)
library(mapview)

Data cleaning

Crimson Hexagon data is saved in two day bulk exports. The CH website only allows exports of 10,000 randomly selected tweets. There seemed to be between 10-15k over any 2 day period so data was exported in 2-day chunks to try and get as much data as possible. Two filters were applied to the data before downloading - the location was set to Santa Barbara (this does not mean the tweet was geotagged but that it came from the area) and that it was an Original Tweet (not a retweet).

# List every two-day Crimson Hexagon export. The pattern is anchored so only
# files ending in ".xlsx" match ("." is a regex wildcard otherwise, and the
# old pattern would also have matched e.g. "foo.xlsx.bak").
xl_files <- list.files("../data/daily", pattern = "\\.xlsx$", full.names = TRUE)

# Read the GUID (tweet ID) column from each export. Results are collected in
# a preallocated list and bound once at the end — rbind() inside the loop
# copies the whole accumulator on every iteration (O(n^2)).
# NOTE(review): the original prepended each file, so rows came out in reverse
# file order; here they are in listing order. Only downstream chunk
# membership is affected, not the set of IDs.
id_list <- vector("list", length(xl_files))

for (i in seq_along(xl_files)) {
  print(i)  # crude progress indicator
  id_list[[i]] <- read_excel(xl_files[i], skip = 1) %>%
    select(GUID)
}

ids <- bind_rows(id_list)

# Split the IDs into 29 non-overlapping, roughly equal chunks and write each
# chunk as a plain-text file (one ID per line) for twarc to hydrate.
#
# Fixes two defects in the original version:
#  * seq(1, nrow(ids), length.out = 30) produced fractional, *overlapping*
#    index ranges, so each boundary ID was written to two files.
#  * as.numeric() on 64-bit tweet IDs loses precision — doubles carry only
#    ~15-16 significant digits, so converted IDs no longer hydrate to the
#    right tweet. IDs are written untouched here.
n_chunks <- 29
bounds <- floor(seq(1, nrow(ids) + 1, length.out = n_chunks + 1))

for (i in seq_len(n_chunks)) {
  chunk <- ids[bounds[i]:(bounds[i + 1] - 1), ]

  # quote = FALSE so character IDs are written bare, as twarc expects
  write.table(chunk$GUID,
              file = paste0("../data/twitter_ids_", i, ".txt"),
              sep = "\t", quote = FALSE,
              row.names = FALSE, col.names = FALSE)
}

Now I use the python library, twarc in my terminal to “hydrate” the data using the tweet IDs. The Crimson Hexagon data does not give us much information but the twarc library lets us use the twitter id to grab a lot more information (including coordinates for geotagged tweets).

Once this is done, all tweets are saved in a JSON file.

# Read each twarc-hydrated JSONL file into its own data frame
# (tweets1 ... tweets29), one per two-day chunk of tweet IDs.
for (i in 1:29) {
  assign(paste0("tweets", i),
         stream_in(file(paste0("../data/tweets", i, ".jsonl"))))
}
create_tweet_df <- function(tweets){
  # Flatten one hydrated-tweet data frame (from jsonlite::stream_in) down to
  # the columns we need, keeping only geotagged tweets.
  #
  # The original implementation pushed every column through cbind(), which
  # coerces the whole thing to a character matrix (making its as.numeric()
  # wrappers dead code), and routed the 64-bit tweet/user IDs through
  # doubles — which only carry ~15-16 significant digits, so IDs came out as
  # strings like "7.654360e+17". IDs are kept as character here so they
  # survive intact; counts are built as numeric directly.
  tweet_df <- tibble(
    created_at      = as.character(tweets$created_at),
    tweet_id        = as.character(tweets$id_str),
    full_text       = as.character(tweets$full_text),
    user_id         = as.character(tweets$user$id_str),
    user_location   = as.character(tweets$user$location),
    # geo$type is NA when the tweet has no geotag; geo$coordinates is a
    # list column, so as.character() deparses it to "c(lat, lon)" strings
    # (the format the downstream cleaning step expects).
    geo_type        = as.character(tweets$geo$type),
    geo_coordinates = as.character(tweets$geo$coordinates),
    language        = as.character(tweets$lang),
    retweet_count   = as.numeric(tweets$retweet_count),
    favorite_count  = as.numeric(tweets$favorite_count)
  )

  # Keep only tweets with a geotag.
  tweet_df %>%
    filter(!is.na(geo_type))
}

Apply function

# Flatten each raw tweets data frame into its geotagged subset (df1 ... df29).
for (i in 1:29) {
  assign(paste0("df", i),
         create_tweet_df(get(paste0("tweets", i))))
}

Combine

# Stack the 29 per-chunk data frames into one.
all_df <- bind_rows(mget(paste0("df", 1:29)))

Remove points far outside our area of interest. The target bounding box is c(-119.9, 34.38, -119.5, 34.48), but the filter below deliberately keeps a wider margin (longitude -120.5 to -119.5, latitude 33.88 to 34.6).

# Parse the "c(lat, lon)" coordinate strings into numeric lat/lon columns,
# then keep only points inside a generous box around Santa Barbara
# (lon -120.5 to -119.5, lat 33.88 to 34.6). Note: Twitter's geo field
# stores latitude first, hence the c("lat", "lon") order in separate().
tweet_data <- all_df %>%
  mutate(coords = gsub("\\)|c\\(", "", geo_coordinates)) %>%
  separate(coords, c("lat", "lon"), sep = ", ") %>%
  # across() replaces the superseded mutate_at()
  mutate(across(c("lon", "lat"), as.numeric)) %>%
  filter(lat >= 33.88, lat <= 34.6,
         lon <= -119.5, lon >= -120.5)

write_csv(tweet_data, "../data/geotagged_sb_tweets.csv")

Map tweets

Map of all tweets

Turn the tweet_df_w_user_type data frame into a spatial object.

# Re-load the cleaned tweets and convert them to an sf point layer in WGS84
# (EPSG:4326), set directly via the crs argument.
tweet_data <- read_csv("../data/geotagged_sb_tweets.csv")

tweet_sf <- st_as_sf(tweet_data, coords = c("lon", "lat"), crs = 4326)

Interactive with cluster markers

# Interactive leaflet map: one clustered circle marker per tweet, with the
# tweet text as the popup. leaflet(tweet_data) already supplies the data,
# so addCircleMarkers() does not need to repeat it.
map <- leaflet(tweet_data) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(lng = ~lon, lat = ~lat, popup = ~full_text,
                   radius = 3, stroke = FALSE, fillOpacity = 0.5,
                   clusterOptions = markerClusterOptions())

# Save a standalone HTML copy alongside the other figures.
mapshot(map, "../figs/all_tweet_map_cluster_markers.html")

map

Static map of downtown

# Static hex-bin density map of downtown Santa Barbara.
cols <- brewer.pal(9, "OrRd")[2:9]
register_google(Sys.getenv("GOOGLE_ACCESS_TOKEN"))

# santa barbara basemap, zoomed to downtown
sb.map <- get_map("santa barbara, california", zoom = 14, maptype = "toner-lite")

ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  # after_stat(count) replaces the deprecated ..count.. pronoun.
  # right = FALSE makes the cut() intervals [0,5), [5,10), ... line up with
  # the legend labels; the original right-closed intervals (0,5], (5,10]
  # were off by one at every break (e.g. "5-9" actually covered 6-10).
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(after_stat(count),
                          c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf),
                          right = FALSE)),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5", "5-9", "10-49 ", "50-99 ",
               "100-499 ", "500-999 ", "1000-2499 ", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in Santa Barbara 2015-2019")

ggsave("../figs/all_tweets_sb_static_hex_map.png")

Static map of whole area

# Static hex-bin density map of the wider Santa Barbara area (zoom = 11).
sb.map <- get_map("santa barbara, california", zoom = 11, maptype = "toner-lite")

ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  # after_stat(count) replaces the deprecated ..count.. pronoun, and
  # right = FALSE aligns the cut() intervals with the legend labels
  # (see the downtown map above for details).
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(after_stat(count),
                          c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf),
                          right = FALSE)),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5 ", "5-9", "10-49 ", "50-99 ",
               "100-499 ", "500-999 ", "1000-2499 ", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in larger SB area 2015-2019")

Interactive hex density

Get hex density by overlaying with points

# Load the pre-built hexagon grid and count the tweets falling in each hex.
hex_grid <- read_sf("../data/sb_area_hexagons.shp")

tweets_per_hex <- lengths(st_intersects(hex_grid, tweet_sf))

hex_tweet_count <- hex_grid %>%
  mutate(tweet_count = tweets_per_hex)

# Show only hexes containing at least one tweet.
mapview(filter(hex_tweet_count, tweet_count > 0),
        zcol = "tweet_count", layer.name = "# tweets")

Why are there so many tweets near the hospital at De La Vina and Arrellaga? Let’s take a closer look at tweet counts by geo_coordinates.

# Tally tweets per exact coordinate string, most frequent first.
# count(sort = TRUE) is group_by + summarize(n()) + arrange(desc(n)) in one.
geo_tweets <- tweet_data %>%
  count(geo_coordinates, name = "count", sort = TRUE)

head(geo_tweets)
## # A tibble: 6 x 2
##   geo_coordinates               count
##   <chr>                         <int>
## 1 c(34.4258, -119.714)          11583
## 2 c(34.42, -119.7)               2062
## 3 c(34.4337, -119.632)            916
## 4 c(34.41938, -119.69905)         831
## 5 c(34.39916667, -119.51638889)   706
## 6 c(34.4405, -119.814)            513

So one coordinate has 11,583 tweets from it. The next highest is just 2,062 tweets.

# Zoom way in (zoom = 17) on the suspicious high-volume coordinate near
# De La Vina & Arrellaga. NOTE(review): get_map() takes location as
# c(lon, lat); the values below appear to be in that order — confirm.
sb.zoom.map <- get_map(location = c( -119.7158247, 34.4262342), zoom = 17, maptype = "toner-lite")

# geom_hex() supplies no aes() here; it relies on inheriting the default
# x = lon / y = lat mapping that ggmap() establishes for its base plot —
# presumably matching tweet_data's lon/lat columns. Verify if this is moved.
ggmap(sb.zoom.map,  legend="none") +
  coord_equal() +
    labs(x = NULL, y = NULL) +
    theme(axis.text = element_blank()) +
    geom_hex(data = tweet_data, bins = 50)

The light blue point corresponds to the coordinates c(34.4258, -119.714). I think this is the default coordinate assigned when someone tags Santa Barbara without a precise location. The first clue is that there is nothing of significance at this spot — it is a residential area. Let’s take a look at a handful of tweets coming from here.

# Pull every tweet posted from the suspicious default coordinate, then show
# a random sample of 10 in a compact styled table.
delavina_tweets <- filter(tweet_data,
                          geo_coordinates == "c(34.4258, -119.714)")

delavina_tweets %>%
  sample_n(10) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed"),
                font_size = 10, fixed_thead = TRUE)
created_at tweet_id full_text user_id user_location geo_type geo_coordinates language retweet_count favorite_count lat lon
Tue Aug 16 06:32:18 +0000 2016 7.654360e+17 DAY 10: shopping in #cambria , visiting #morrobay to see more #sealions , more shopping in… https://t.co/ATUVccZl21 20236584 Liverpool, England Point c(34.4258, -119.714) en 0 1 34.4258 -119.714
Fri Jun 28 21:43:32 +0000 2019 1.144723e+18 🦋The Only Late Realization Of Being Vegan Is That Life Is Better Eating Clean/Raw/ And Knowing That Transformation Is Not An Easy Task But So Worthy Because Wellness Is The Complete Integration Of Connecting Mind… https://t.co/nxy1HfE0lc 266079193 Santa Barbara Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Sat Jan 14 05:45:08 +0000 2017 8.201447e+17 Mom’s DELICIOUS Dairy-free mint chocolate chip ice cream. The best batch yet! Would you believe… https://t.co/bLZmK6Rpmq 32197449 Santa Barbara, California Point c(34.4258, -119.714) en 0 3 34.4258 -119.714
Wed Nov 25 00:42:05 +0000 2015 6.693151e+17 The sky up here is amazing! @ Santa Barbara, California https://t.co/4sVJ7FVj80 29305862 Santa Barbara, CA Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Mon Nov 21 23:31:19 +0000 2016 8.008441e+17 Santa Barbara I love you🌊🌴 @ Santa Barbara, California https://t.co/uAu0kEgsST 433913850 San Diego, CA Point c(34.4258, -119.714) en 0 1 34.4258 -119.714
Mon Jun 25 16:22:08 +0000 2018 1.011283e+18 You are a ray of sunshine on a cloudy day♥️ @ Santa Barbara, California https://t.co/MoLGRN0ifY 176148809 KSA / Jordan / Palestine Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Fri Jul 28 19:31:00 +0000 2017 8.910182e+17 #HappyBirthdayMom #mom #momlife you are always on my mind. R.I.P ♥️ July.28 today is My Mother… https://t.co/IhthplT0hy 165527324 Buffalo,New York Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Fri Oct 07 17:39:00 +0000 2016 7.844480e+17 Two blue-eyed England girls hanging out on the wee one’s first birthday! #LookyThere!… https://t.co/qDReRQvda9 26617510 Vista, CA USA Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Tue May 17 04:44:08 +0000 2016 7.324315e+17 taking a walk on the beach @amgentoc #sanddollars are everywhere #doepicshit #sunset #getoutside… https://t.co/nIUpOGJGWa 16132082 United States Point c(34.4258, -119.714) en 0 0 34.4258 -119.714
Mon Nov 14 00:20:40 +0000 2016 7.979574e+17 My Days: 11/13/2016: ➡" #MarketingByJewminatti "⬅️ #Artist #Wizard #Magic #WritersOfIG… https://t.co/bM3Re9Y8lk 224387002 Cerritos, CA Point c(34.4258, -119.714) en 0 0 34.4258 -119.714

Look at the log of tweet count.

# Recompute per-hex counts plus a log-scaled count and a coarse bin column.
# (The bin column is computed but not used by the map below.)
hex_tweet_count <- hex_grid %>%
  mutate(
    tweet_count     = lengths(st_intersects(hex_grid, tweet_sf)),
    log_tweet_count = log(tweet_count),
    # case_when() takes the first matching branch, so each upper bound
    # only needs stating once; the catch-all covers counts >= 2000.
    bin = case_when(
      tweet_count < 10   ~ 10,
      tweet_count < 50   ~ 50,
      tweet_count < 100  ~ 100,
      tweet_count < 500  ~ 500,
      tweet_count < 1000 ~ 1000,
      tweet_count < 2000 ~ 2000,
      TRUE               ~ 2001
    )
  )

# Hexes with zero tweets have log(0) = -Inf; drop them before mapping.
nonempty_hexes <- filter(hex_tweet_count, tweet_count > 0)

log_hex_map <- mapview(nonempty_hexes,
                       zcol = "log_tweet_count",
                       layer.name = "Tweet count (log)")

mapshot(log_hex_map, "../figs/sb_tweets_log_hex_map.html")

log_hex_map